library(RSQLite)
library(data.table)
library(stringr)
library(splitstackshape)
library(plyr)
library(dplyr)
library(foreign)
library(stringr)
require("stringi")
require(stringdist)
require(RecordLinkage)
library(foreign)
library(stringdist)
library(doParallel)
rm(list=ls())

######################################################################################################
### Aim: Place I-CEM raw data into sqlite
### Datasets used:  - EW51outputfile.txt 
###                 - icemnamesew51_protect.txt
###                 - EW81outputfile.txt
###                 - icemnamesew81_protect.txt
### Datasets created: - icem.sqlite 
######################################################################################################

##################
## CENSUS 1851 ###
##################

var.names <- c("Year", "RecID", "ParID", "ConParID", "Country",	"Division",	"RegCnty", "RegDist",	"SubDist", "RC", "RD", "RSD",	"Parish",	
               "Area",	"Part",	"Population",	"MalePop", "FemalePop",	"NoOfInstit",	"InstitPop","ParType", "Censusref",	"ImageRef",	"PageType",	
               "DocType", "EnuDist",	"BuildType","BTCode",	"NoOfRooms", "NoOfRoomsCode",	"Schedule",	"H",	"Absent", "Absentcode",	"HSS", "Size",	
               "InstName",	"InstDesc","VessName","VessPos","PID","Sex","SexInf","Age","Cage","AgeInf",	"Cond",	"Mar","MarInf",	"Relat","Rela",	
               "RelInf",	"HeadInf","Occ", "HollerOcc",	"Occode",	"HISCO","Industry",	"HollerInd","Employ",	"EmployCode",	"AtHome",	"Inactive",	
               "Disab", "DisCode1", "DisCode2","Bpstring",	"BpCmty", "Std_Par","BpCnty",	"Cnti",	"Alt_Cnti",	"BpCtry",	"Ctry",	"Alt_Ctry",	"HollerB",
               "Nationality","Lang",	"Langcode",	"YearsMar","MarYear",	"ChildTot",	"ChildAlive",	"ChildDead","ChildrenCode",	"HHD",
               "H_Sname","H_Sex","H_Age","H_Rela",	"H_Mar","H_Occ","H_CFU","SameName","CFU",	"n_CFUs",	"tn_CFUs","CFUsize","Spouse",	"Father",	"Mother",	
               "f_Off","m_Off",	"m_Offm",	"f_Offm",	"Offsp",	"Kids",	"Relats",	"Inmates",	"Servts",	"Non_Rels",	"Visitors",	"Military")

n = 100000 

f = file("EW51outputfile.txt") 
con = open(f) 
data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
names(data) <- var.names   

con_data = dbConnect(SQLite(), dbname="icem.sqlite")

while(nrow(data) == n) { # if not reached the end of line
  dbWriteTable(con_data, data, name="census1851", append=TRUE)
  data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
  names(data) <- var.names      
  print(n)
}

close(f)

var.names <- c("Year", "RecID", "Title", "Pname", "Oname", "Sname", "Address")
n = 100000 
f = file("icemnamesew51_protect.txt") 
con = open(f) 
data <-read.table(f, nrows=n, header=TRUE, sep="\t", fill=TRUE, quote='')
names(data) <- var.names   

con_data = dbConnect(SQLite(), dbname="icem.sqlite")

while(nrow(data) == n) { 
  dbWriteTable(con_data, data, name="census1851b", append=TRUE)
  data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
  names(data) <- var.names      
  print(n)
}

dbDisconnect(con_data)

##################
## CENSUS 1881 ###
##################

var.names <- c("Year","RecID","ParID","ConParID","Country","Division","RegCnty","RegDist","SubDist","RC","RD","RSD","Parish","Area","Part","Population",
               "MalePop","FemalePop","NoOfInstit","InstitPop","ParType","Censusref","ImageRef","PageType","DocType","EnuDist","BuildType","BTCode","NoOfRooms","NoOfRoomsCode",
               "Schedule","H","Absent","Absentcode","HSS","Size","InstName","InstDesc","VessName","VessPos","PID","Sex","SexInf","Age","Cage","AgeInf","Cond","Mar","MarInf",
               "Relat","Rela","RelInf","HeadInf","Occ","HollerOcc","Occode","HISCO","Industry","HollerInd","Employ","EmployCode","AtHome","Inactive","Disab","DisCode1","DisCode2",
               "Bpstring","BpCmty","Std_Par","BpCnty","Cnti","Alt_Cnti","BpCtry","Ctry","Alt_Ctry","HollerB","Nationality","Lang","Langcode","YearsMar","MarYear","ChildTot","ChildAlive",
               "ChildDead","ChildrenCode","HHD","H_Sname","H_Sex","H_Age","H_Rela","H_Mar","H_Occ","H_CFU","SameName","CFU","n_CFUs","tn_CFUs","CFUsize","Spouse","Father","Mother",
               "f_Off","m_Off","m_Offm","f_Offm","Offsp","Kids","Relats","Inmates","Servts","Non_Rels","Visitors","Military")

n = 100000 

f = file("EW81outputfile.txt") 
con = open(f) 
data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
names(data) <- var.names   

con_data = dbConnect(SQLite(), dbname="icem.sqlite")

while(nrow(data) == n) { # if not reached the end of line
  dbWriteTable(con_data, data, name="census1881", append=TRUE) #write to sqlite 
  data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
  names(data) <- var.names      
  print(n)
}

close(f)

var.names <- c("Year", "RecID", "Title", "Pname", "Oname", "Sname", "Address")
n = 100000 
f = file("icemnamesew81_protect.txt") 
con = open(f) 
data <-read.table(f, nrows=n, header=TRUE, sep="\t", fill=TRUE, quote='')
names(data) <- var.names   

con_data = dbConnect(SQLite(), dbname="icem.sqlite")

while(nrow(data) == n) { 
  dbWriteTable(con_data, data, name="census1881b", append=TRUE)
  data <-read.table(f, nrows=n, header=FALSE, sep="\t", fill=TRUE, quote='')
  names(data) <- var.names      
  print(n)
}

dbDisconnect(con_data)